CAS Logo Open main paper 🔗

2  Estimating the five fairness premiums

Objectives

Building on the simulations in Chapter 1, this section estimates the spectrum of fairness. It complements Section 4.3 of the main paper, linked from the supplement’s introduction. We recommend keeping the main paper nearby, as this supplement focuses exclusively on implementation.

Packages for this section
library(tidyverse)
library(latex2exp)
library(jsonlite)

## also setup python for optimal transport. 
library(reticulate)
get_python_env_path <- function(file_path = "python_env_path.txt") {
  ## Read the Python environment path from a one-line configuration file.
  ##
  ## file_path: path to a text file whose first non-comment, non-empty line
  ##            is the path of the Python environment to use.
  ## Returns the path as a trimmed character scalar; errors when the file is
  ## missing or contains no usable line.
  ##
  ## Fix: the error messages previously hardcoded 'python_env_path.txt',
  ## which was misleading whenever a custom file_path was supplied.
  if (!file.exists(file_path)) {
    stop("The file '", file_path,
         "' is missing. Please create it and specify your Python environment path.",
         call. = FALSE)
  }

  ## keep only content lines: drop comment lines and empty lines
  env_path <- readLines(file_path, warn = FALSE)
  env_path <- env_path[!grepl("^#", env_path) & nzchar(env_path)]

  if (length(env_path) == 0) {
    stop("No valid Python environment path found in '", file_path,
         "'. Please enter your path.", call. = FALSE)
  }

  trimws(env_path[1])
}
## Resolve and activate the Python environment for reticulate.
## The previous version wrapped this in tryCatch() only to re-raise
## e$message via stop(), which strips the condition class and call
## information from the original error; letting the error propagate
## is equivalent and keeps the full condition for debugging.
python_env_path <- get_python_env_path()
use_python(python_env_path, required = TRUE)
message("Python environment successfully set to: ", python_env_path)
Data for this section
## Data for this section: train / validation / test splits per scenario.
## Fix: one call was qualified as jsonlite::fromJSON while the others were
## bare, and quoting was inconsistent — all calls now use the same style
## (jsonlite is attached at the top of the file).
sims  <- fromJSON("simuls/train_scenarios.json")
valid <- fromJSON("simuls/valid_scenarios.json")
test  <- fromJSON("simuls/test_scenarios.json")

## For the experiment in section 5
sim_samples <- fromJSON("simuls/sim_study.json")
Functions and objects from past sections
## factor levels and matching TeX labels for the five premium families,
## generated from the shared family letters (B, U, A, H, C)
levels_for_premiums <- paste0("mu_", c("B", "U", "A", "H", "C"))
labels_for_premiums <- paste0("$\\widehat{\\mu}^", c("B", "U", "A", "H", "C"), "$")

## four ingredients for the 5 families of premiums
## Build the five premium functions of the fairness spectrum from four
## ingredients:
##   best         - best-estimate model, called on a data.frame(X1, X2, D)
##   pdx          - propensity model P(D | X), same calling convention
##   maps_to_corr - transport map from best-estimate to corrective premiums,
##                  called on a data.frame(mu_B, D)
##   marg         - marginal distribution of D, indexed by d + 1
## Returns a named list of closures: mu_B, mu_U, mu_A, mu_H, mu_C.
premium_generator <- function(best, pdx, maps_to_corr, marg){

  ## Evaluate `fun` on the covariates with D forced to each level in 0:1;
  ## each column of the result corresponds to one level of D.
  eval_by_level <- function(fun, x1, x2){
    sapply(0:1, function(level){
      fun(data.frame(X1 = x1, X2 = x2, D = rep(level, length(x1))))
    })
  }

  mu_b_fun <- function(x1, x2, d){
    ## direct call of the best-estimate model at the observed D
    best(data.frame(X1 = x1, X2 = x2, D = d))
  }

  mu_u_fun <- function(x1, x2, d = NULL){
    ## explicit inference: mu_U = E(mu_B | X), weighting the per-level
    ## best estimates by the predicted propensity of each level
    best_by_level <- eval_by_level(best, x1, x2)
    prop_by_level <- eval_by_level(pdx, x1, x2)
    apply(best_by_level * prop_by_level, 1, sum)
  }

  mu_a_fun <- function(x1, x2, d = NULL){
    ## mu_A = E(mu_B): unconditional average over the marginal law of D
    weighted <- sapply(0:1, function(level){
      best(data.frame(X1 = x1, X2 = x2,
                      D = rep(level, length(x1)))) * marg[level + 1]
    })
    apply(weighted, 1, sum)
  }

  mu_h_fun <- function(x1, x2, d = NULL){
    ## explicit inference of mu_C: mu_H = E(mu_C | X)
    corr_by_level <- sapply(0:1, function(level){
      base_prem <- best(data.frame(X1 = x1, X2 = x2,
                                   D = rep(level, length(x1))))
      maps_to_corr(data.frame(mu_B = base_prem, D = level))
    })
    prop_by_level <- eval_by_level(pdx, x1, x2)
    apply(corr_by_level * prop_by_level, 1, sum)
  }

  mu_c_fun <- function(x1, x2, d){
    ## mu_C = T^{d}(mu_B(x, d)): transport of the best estimate
    base_prem <- best(data.frame(X1 = x1, X2 = x2, D = d))
    maps_to_corr(data.frame(mu_B = base_prem, D = d))
  }

  list("mu_B" = mu_b_fun,
       "mu_U" = mu_u_fun,
       "mu_A" = mu_a_fun,
       "mu_H" = mu_h_fun,
       "mu_C" = mu_c_fun)
}

levels_for_quants <- c('proxy_vuln', 'risk_spread', 'fair_range', 'parity_cost')


## Build the four fairness quantities from a list of premium functions
## (as produced by premium_generator). Returns a named list of closures:
## proxy_vuln, risk_spread, fair_range, parity_cost.
quant_generator <- function(premiums){

  ## row-wise range (max minus min) across the columns of a data frame
  row_range <- function(tab){
    apply(tab, 1, max) - apply(tab, 1, min)
  }

  list(
    'proxy_vuln' = function(x1, x2, d){
      ## gap between the unaware and the aware premium
      premiums$mu_U(x1 = x1, x2 = x2) - premiums$mu_A(x1 = x1, x2 = x2)
    },
    'risk_spread' = function(x1, x2, x3, d){
      ## spread of the best estimate across the two protection levels
      both_levels <- data.frame(
        risk1 = premiums$mu_B(x1 = x1, x2 = x2, d = 1),
        risk0 = premiums$mu_B(x1 = x1, x2 = x2, d = 0)
      )
      row_range(both_levels)
    },
    'fair_range' = function(x1, x2, d){
      ## width of the full fairness spectrum at a given risk profile
      spectrum <- data.frame(
        mu_b = premiums$mu_B(x1 = x1, x2 = x2, d = d),
        mu_u = premiums$mu_U(x1 = x1, x2 = x2, d = NULL),
        mu_a = premiums$mu_A(x1 = x1, x2 = x2, d = NULL),
        mu_h = premiums$mu_H(x1 = x1, x2 = x2, d = NULL),
        mu_c = premiums$mu_C(x1 = x1, x2 = x2, d = d)
      )
      row_range(spectrum)
    },
    'parity_cost' = function(x1, x2, d){
      ## price of enforcing demographic parity on top of the best estimate
      premiums$mu_C(x1 = x1, x2 = x2, d = d) -
        premiums$mu_B(x1 = x1, x2 = x2, d = d)
    }
  )
}

the_CAS_colors <- c('#FED35D', '#F89708', '#205ED5', '#142345')

In this section, we provide an example of detailed methodology for estimating the benchmark premiums from the spectrum of fairness defined in Section 4.1 of the main paper. It complements the estimation discussion of Section 4.1.

In this paper, premiums are estimated using lightgbm algorithms from Ke et al. (2017), a package that supports common actuarial distributions (Tweedie, Poisson, Gamma, etc.) and was found to have excellent predictive performance compared to other boosting algorithms in Chevalier and Côté (2024). Given lightgbm’s flexibility, we advocate for careful variable preselection.

We optimize hyperparameters with a validation set to prevent overfitting. Key hyperparameters in lightgbm are the number of trees (num_iterations), regularization (lambda_l1, lambda_l2) for complexity control, and subsampling percentages (feature_fraction, bagging_fraction).

The data used to construct prices and the estimated spectrum should align to ensure comparability. In the Case Study of the main paper, we conduct a posteriori benchmarking, as the study period is historical. A posteriori assessment can reveal segments where the model misaligned from fairness pillars after the fact. In contrast, conducting benchmarking concurrently with the development of a new pricing model provides an estimate of how well the commercial price aligns with fairness pillars for future periods, though the true fairness implications of the commercial price will only become evident retrospectively.

Discretizing \(D\)

Subsequent steps involve numerous computations over all possible values of \(D\). When \(D\) is continuous, discretization can improve tractability. The discretized version should be used throughout this appendix, and the discretization function should be kept for future applications. For multivariate \(D\), one approach is to first discretize continuous components, then treat the vector as one categorical variable where each unique combination defines a category. Care should be taken to ensure a manageable number of categories, with sufficient representation in each.

Balancing benchmark premiums

Profit and expense loadings are important, but they must not reintroduce unfairness. To align premium benchmarks with expected profits, we multiply the premiums by a factor \(\delta_j \geq 1~\forall j \in \{B, U, A, H, C\}\) to globally balance all premiums to the level of the commercial price.

In what follows, we detail how we estimate each benchmark premium in our framework.

2.1 Best-estimate premium

The best estimate premium \(\widehat{\mu}^B(\mathbf{x}, d)\) serves as the anchor. Thus, its estimation is crucial for our framework. It provides the best predictor of the response variable \(Y\) when differentiating risks based on \((X, D)\). It can be derived from indicated rates or data-driven (what we do) estimates as detailed in Complement 5 of the main paper.

Training the data-driven best-estimate price
source('___lgb_best_estimate.R')

## remove stale best-estimate predictions before re-training
unlink(file.path('preds', "*_best_estimate.json"))

## hyperparameter grid for optimization (a single configuration here)
hyperparameter_grid <- expand.grid(
  learning_rate = c(0.01),
  feature_fraction = c(0.75),
  bagging_fraction = c(0.75),
  max_depth = c(5),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## one best-estimate lightgbm per scenario, named after the scenario
best_lgb <- lapply(setNames(nm = names(sims)), function(name){
  scenario_data <- list('train' = sims[[name]],
                        'valid' = valid[[name]],
                        'test' = test[[name]])
  the_best_estimate_lightgbm_fun(list_data = scenario_data,
                                 name = name,
                                 hyperparameter_grid = hyperparameter_grid)
})
Training the best-estimate price on experimental data (for later use in section 5)
## hyperparameter grid for the simulation-study models (section 5)
hyperparameter_grid_sims <- expand.grid(
  learning_rate = c(0.01),
  bagging_fraction = c(0.75),
  max_depth = c(6),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## one best-estimate lightgbm per simulated sample
best_sims <- lapply(seq_along(sim_samples), function(idx){
  sample_data <- list('train' = sim_samples[[idx]]$train,
                      'valid' = sim_samples[[idx]]$valid,
                      'test' = sim_samples[[idx]]$test)
  the_best_estimate_lightgbm_fun(list_data = sample_data,
                                 name = paste0('sim', idx),
                                 hyperparameter_grid = hyperparameter_grid_sims)
})
Best for scenario:  sim1  
hyperparam  1 / 1 
Best valid mse: 59.99265  
optimal ntree: 461  
Training time: 20.61846  sec. 
Best for scenario:  sim2  
hyperparam  1 / 1 
Best valid mse: 57.62633  
optimal ntree: 372  
Training time: 18.97754  sec. 
Best for scenario:  sim3  
hyperparam  1 / 1 
Best valid mse: 69.94087  
optimal ntree: 372  
Training time: 16.64535  sec. 
Best for scenario:  sim4  
hyperparam  1 / 1 
Best valid mse: 57.81305  
optimal ntree: 269  
Training time: 15.6649  sec. 
Best for scenario:  sim5  
hyperparam  1 / 1 
Best valid mse: 52.80083  
optimal ntree: 355  
Training time: 16.73688  sec. 
Best for scenario:  sim6  
hyperparam  1 / 1 
Best valid mse: 53.77866  
optimal ntree: 380  
Training time: 17.77471  sec. 
Best for scenario:  sim7  
hyperparam  1 / 1 
Best valid mse: 61.28811  
optimal ntree: 388  
Training time: 16.89485  sec. 
Best for scenario:  sim8  
hyperparam  1 / 1 
Best valid mse: 51.2984  
optimal ntree: 325  
Training time: 26.8786  sec. 
Best for scenario:  sim9  
hyperparam  1 / 1 
Best valid mse: 51.62046  
optimal ntree: 300  
Training time: 16.11736  sec. 
Best for scenario:  sim10  
hyperparam  1 / 1 
Best valid mse: 55.70607  
optimal ntree: 343  
Training time: 14.81962  sec. 
Best for scenario:  sim11  
hyperparam  1 / 1 
Best valid mse: 59.86937  
optimal ntree: 364  
Training time: 19.1446  sec. 
Best for scenario:  sim12  
hyperparam  1 / 1 
Best valid mse: 56.77101  
optimal ntree: 475  
Training time: 19.90032  sec. 
Best for scenario:  sim13  
hyperparam  1 / 1 
Best valid mse: 60.81931  
optimal ntree: 405  
Training time: 19.0869  sec. 
Best for scenario:  sim14  
hyperparam  1 / 1 
Best valid mse: 58.33509  
optimal ntree: 412  
Training time: 15.7115  sec. 
Best for scenario:  sim15  
hyperparam  1 / 1 
Best valid mse: 53.88607  
optimal ntree: 346  
Training time: 14.43535  sec. 
Best for scenario:  sim16  
hyperparam  1 / 1 
Best valid mse: 59.23596  
optimal ntree: 389  
Training time: 18.07781  sec. 
Best for scenario:  sim17  
hyperparam  1 / 1 
Best valid mse: 54.3871  
optimal ntree: 287  
Training time: 16.25946  sec. 
Best for scenario:  sim18  
hyperparam  1 / 1 
Best valid mse: 57.00337  
optimal ntree: 560  
Training time: 15.46862  sec. 
Best for scenario:  sim19  
hyperparam  1 / 1 
Best valid mse: 60.03734  
optimal ntree: 436  
Training time: 14.21177  sec. 
Best for scenario:  sim20  
hyperparam  1 / 1 
Best valid mse: 52.84898  
optimal ntree: 440  
Training time: 13.85778  sec. 
Best for scenario:  sim21  
hyperparam  1 / 1 
Best valid mse: 55.30225  
optimal ntree: 293  
Training time: 10.96385  sec. 
Best for scenario:  sim22  
hyperparam  1 / 1 
Best valid mse: 53.69389  
optimal ntree: 490  
Training time: 17.59354  sec. 
Best for scenario:  sim23  
hyperparam  1 / 1 
Best valid mse: 62.21743  
optimal ntree: 355  
Training time: 14.39058  sec. 
Best for scenario:  sim24  
hyperparam  1 / 1 
Best valid mse: 53.68421  
optimal ntree: 355  
Training time: 15.04168  sec. 
Best for scenario:  sim25  
hyperparam  1 / 1 
Best valid mse: 60.09094  
optimal ntree: 329  
Training time: 11.58904  sec. 
Best for scenario:  sim26  
hyperparam  1 / 1 
Best valid mse: 47.60288  
optimal ntree: 292  
Training time: 13.49958  sec. 
Best for scenario:  sim27  
hyperparam  1 / 1 
Best valid mse: 60.22015  
optimal ntree: 406  
Training time: 17.03379  sec. 
Best for scenario:  sim28  
hyperparam  1 / 1 
Best valid mse: 59.61174  
optimal ntree: 305  
Training time: 12.77432  sec. 
Best for scenario:  sim29  
hyperparam  1 / 1 
Best valid mse: 55.34413  
optimal ntree: 304  
Training time: 11.61683  sec. 
Best for scenario:  sim30  
hyperparam  1 / 1 
Best valid mse: 64.67445  
optimal ntree: 384  
Training time: 20.92079  sec. 
Best for scenario:  sim31  
hyperparam  1 / 1 
Best valid mse: 47.12717  
optimal ntree: 477  
Training time: 18.44887  sec. 
Best for scenario:  sim32  
hyperparam  1 / 1 
Best valid mse: 48.75106  
optimal ntree: 326  
Training time: 13.19456  sec. 
Best for scenario:  sim33  
hyperparam  1 / 1 
Best valid mse: 54.51144  
optimal ntree: 332  
Training time: 12.65162  sec. 
Best for scenario:  sim34  
hyperparam  1 / 1 
Best valid mse: 55.22692  
optimal ntree: 406  
Training time: 14.61173  sec. 
Best for scenario:  sim35  
hyperparam  1 / 1 
Best valid mse: 57.43479  
optimal ntree: 318  
Training time: 14.63772  sec. 
Best for scenario:  sim36  
hyperparam  1 / 1 
Best valid mse: 60.98215  
optimal ntree: 428  
Training time: 16.03957  sec. 
Best for scenario:  sim37  
hyperparam  1 / 1 
Best valid mse: 57.51285  
optimal ntree: 331  
Training time: 12.74145  sec. 
Best for scenario:  sim38  
hyperparam  1 / 1 
Best valid mse: 57.96803  
optimal ntree: 383  
Training time: 13.28582  sec. 
Best for scenario:  sim39  
hyperparam  1 / 1 
Best valid mse: 53.05994  
optimal ntree: 290  
Training time: 10.56849  sec. 
Best for scenario:  sim40  
hyperparam  1 / 1 
Best valid mse: 53.19274  
optimal ntree: 404  
Training time: 14.10217  sec. 
Best for scenario:  sim41  
hyperparam  1 / 1 
Best valid mse: 58.2642  
optimal ntree: 368  
Training time: 13.9081  sec. 
Best for scenario:  sim42  
hyperparam  1 / 1 
Best valid mse: 54.46977  
optimal ntree: 329  
Training time: 11.90766  sec. 
Best for scenario:  sim43  
hyperparam  1 / 1 
Best valid mse: 54.93803  
optimal ntree: 324  
Training time: 14.05352  sec. 
Best for scenario:  sim44  
hyperparam  1 / 1 
Best valid mse: 52.68502  
optimal ntree: 273  
Training time: 12.74424  sec. 
Best for scenario:  sim45  
hyperparam  1 / 1 
Best valid mse: 59.00814  
optimal ntree: 445  
Training time: 16.61344  sec. 
Best for scenario:  sim46  
hyperparam  1 / 1 
Best valid mse: 52.3815  
optimal ntree: 401  
Training time: 13.94766  sec. 
Best for scenario:  sim47  
hyperparam  1 / 1 
Best valid mse: 56.59985  
optimal ntree: 333  
Training time: 16.76318  sec. 
Best for scenario:  sim48  
hyperparam  1 / 1 
Best valid mse: 51.65326  
optimal ntree: 333  
Training time: 14.19517  sec. 
Best for scenario:  sim49  
hyperparam  1 / 1 
Best valid mse: 55.94536  
optimal ntree: 360  
Training time: 15.0027  sec. 
Best for scenario:  sim50  
hyperparam  1 / 1 
Best valid mse: 52.62897  
optimal ntree: 377  
Training time: 12.38816  sec. 
Best for scenario:  sim51  
hyperparam  1 / 1 
Best valid mse: 64.34462  
optimal ntree: 409  
Training time: 13.89119  sec. 
Best for scenario:  sim52  
hyperparam  1 / 1 
Best valid mse: 60.50832  
optimal ntree: 375  
Training time: 11.95199  sec. 
Best for scenario:  sim53  
hyperparam  1 / 1 
Best valid mse: 54.41756  
optimal ntree: 414  
Training time: 14.2317  sec. 
Best for scenario:  sim54  
hyperparam  1 / 1 
Best valid mse: 58.74899  
optimal ntree: 386  
Training time: 15.13847  sec. 
Best for scenario:  sim55  
hyperparam  1 / 1 
Best valid mse: 54.77497  
optimal ntree: 457  
Training time: 14.23579  sec. 
Best for scenario:  sim56  
hyperparam  1 / 1 
Best valid mse: 60.44314  
optimal ntree: 550  
Training time: 17.88683  sec. 
Best for scenario:  sim57  
hyperparam  1 / 1 
Best valid mse: 46.662  
optimal ntree: 350  
Training time: 14.12562  sec. 
Best for scenario:  sim58  
hyperparam  1 / 1 
Best valid mse: 54.61111  
optimal ntree: 333  
Training time: 13.26824  sec. 
Best for scenario:  sim59  
hyperparam  1 / 1 
Best valid mse: 62.99965  
optimal ntree: 340  
Training time: 11.2711  sec. 
Best for scenario:  sim60  
hyperparam  1 / 1 
Best valid mse: 57.83195  
optimal ntree: 345  
Training time: 14.03413  sec. 
Best for scenario:  sim61  
hyperparam  1 / 1 
Best valid mse: 55.84609  
optimal ntree: 406  
Training time: 13.52504  sec. 
Best for scenario:  sim62  
hyperparam  1 / 1 
Best valid mse: 61.38334  
optimal ntree: 434  
Training time: 16.00593  sec. 
Best for scenario:  sim63  
hyperparam  1 / 1 
Best valid mse: 58.37945  
optimal ntree: 382  
Training time: 29.59574  sec. 
Best for scenario:  sim64  
hyperparam  1 / 1 
Best valid mse: 52.56473  
optimal ntree: 465  
Training time: 24.14801  sec. 
Best for scenario:  sim65  
hyperparam  1 / 1 
Best valid mse: 52.37869  
optimal ntree: 294  
Training time: 19.35547  sec. 
Best for scenario:  sim66  
hyperparam  1 / 1 
Best valid mse: 56.72573  
optimal ntree: 320  
Training time: 18.32182  sec. 
Best for scenario:  sim67  
hyperparam  1 / 1 
Best valid mse: 53.39241  
optimal ntree: 370  
Training time: 21.57507  sec. 
Best for scenario:  sim68  
hyperparam  1 / 1 
Best valid mse: 50.64998  
optimal ntree: 318  
Training time: 18.76665  sec. 
Best for scenario:  sim69  
hyperparam  1 / 1 
Best valid mse: 60.15548  
optimal ntree: 332  
Training time: 16.28578  sec. 
Best for scenario:  sim70  
hyperparam  1 / 1 
Best valid mse: 53.1135  
optimal ntree: 269  
Training time: 12.49678  sec. 
Best for scenario:  sim71  
hyperparam  1 / 1 
Best valid mse: 56.09982  
optimal ntree: 498  
Training time: 16.10272  sec. 
Best for scenario:  sim72  
hyperparam  1 / 1 
Best valid mse: 50.88737  
optimal ntree: 358  
Training time: 15.29152  sec. 
Best for scenario:  sim73  
hyperparam  1 / 1 
Best valid mse: 54.57941  
optimal ntree: 588  
Training time: 18.47234  sec. 
Best for scenario:  sim74  
hyperparam  1 / 1 
Best valid mse: 59.95548  
optimal ntree: 333  
Training time: 18.02597  sec. 
Best for scenario:  sim75  
hyperparam  1 / 1 
Best valid mse: 53.30214  
optimal ntree: 333  
Training time: 13.92138  sec. 
Best for scenario:  sim76  
hyperparam  1 / 1 
Best valid mse: 53.82538  
optimal ntree: 454  
Training time: 15.24619  sec. 
Best for scenario:  sim77  
hyperparam  1 / 1 
Best valid mse: 47.1502  
optimal ntree: 360  
Training time: 13.22314  sec. 
Best for scenario:  sim78  
hyperparam  1 / 1 
Best valid mse: 56.06046  
optimal ntree: 334  
Training time: 13.21804  sec. 
Best for scenario:  sim79  
hyperparam  1 / 1 
Best valid mse: 53.47778  
optimal ntree: 511  
Training time: 14.08862  sec. 
Best for scenario:  sim80  
hyperparam  1 / 1 
Best valid mse: 57.72975  
optimal ntree: 235  
Training time: 8.897715  sec. 
Best for scenario:  sim81  
hyperparam  1 / 1 
Best valid mse: 51.53007  
optimal ntree: 354  
Training time: 12.66523  sec. 
Best for scenario:  sim82  
hyperparam  1 / 1 
Best valid mse: 60.50198  
optimal ntree: 408  
Training time: 13.60901  sec. 
Best for scenario:  sim83  
hyperparam  1 / 1 
Best valid mse: 54.75599  
optimal ntree: 438  
Training time: 14.41076  sec. 
Best for scenario:  sim84  
hyperparam  1 / 1 
Best valid mse: 54.58065  
optimal ntree: 294  
Training time: 11.6102  sec. 
Best for scenario:  sim85  
hyperparam  1 / 1 
Best valid mse: 57.76709  
optimal ntree: 324  
Training time: 12.04841  sec. 
Best for scenario:  sim86  
hyperparam  1 / 1 
Best valid mse: 48.01183  
optimal ntree: 349  
Training time: 13.19689  sec. 
Best for scenario:  sim87  
hyperparam  1 / 1 
Best valid mse: 56.44882  
optimal ntree: 310  
Training time: 12.01591  sec. 
Best for scenario:  sim88  
hyperparam  1 / 1 
Best valid mse: 51.21924  
optimal ntree: 346  
Training time: 12.31097  sec. 
Best for scenario:  sim89  
hyperparam  1 / 1 
Best valid mse: 48.90171  
optimal ntree: 365  
Training time: 14.36979  sec. 
Best for scenario:  sim90  
hyperparam  1 / 1 
Best valid mse: 59.08503  
optimal ntree: 406  
Training time: 12.65752  sec. 
Best for scenario:  sim91  
hyperparam  1 / 1 
Best valid mse: 53.47507  
optimal ntree: 414  
Training time: 15.719  sec. 
Best for scenario:  sim92  
hyperparam  1 / 1 
Best valid mse: 53.84962  
optimal ntree: 427  
Training time: 13.87975  sec. 
Best for scenario:  sim93  
hyperparam  1 / 1 
Best valid mse: 53.60011  
optimal ntree: 302  
Training time: 14.79754  sec. 
Best for scenario:  sim94  
hyperparam  1 / 1 
Best valid mse: 62.12538  
optimal ntree: 310  
Training time: 12.46992  sec. 
Best for scenario:  sim95  
hyperparam  1 / 1 
Best valid mse: 51.89604  
optimal ntree: 945  
Training time: 18.74459  sec. 
Best for scenario:  sim96  
hyperparam  1 / 1 
Best valid mse: 55.16407  
optimal ntree: 439  
Training time: 14.09114  sec. 
Best for scenario:  sim97  
hyperparam  1 / 1 
Best valid mse: 46.14099  
optimal ntree: 371  
Training time: 13.44467  sec. 
Best for scenario:  sim98  
hyperparam  1 / 1 
Best valid mse: 58.09579  
optimal ntree: 588  
Training time: 19.54331  sec. 
Best for scenario:  sim99  
hyperparam  1 / 1 
Best valid mse: 49.00257  
optimal ntree: 345  
Training time: 12.93805  sec. 
Best for scenario:  sim100  
hyperparam  1 / 1 
Best valid mse: 47.13216  
optimal ntree: 576  
Training time: 17.67981  sec. 

2.2 Unaware premium

The unaware price, \(\mu^U(\mathbf{x})\), excludes direct use of \(D\). It is defined as the best predictor of \(\widehat{\mu}^B(\mathbf{X}, D)\) given \(\mathbf{x}\):
\[\begin{equation*} \widehat{\mu}^U(\mathbf{x}) = E\{\widehat{\mu}^B(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
This maintains consistency with the best estimate premium, whether \(\widehat{\mu}^B\) is data-driven or based on indicated rates (Complement 5 of the main paper). One can use a regression model (e.g., lightgbm) to predict \(\widehat{\mu}^B(\mathbf{X}, D)\) based on \(\mathbf{X}\). The loss function defaults to mean squared error as it is a reasonable distributional assumption for this response variable.

Alternatively (what we do), one can estimate the propensity and explicitly weight the best-estimate premiums based on predicted propensity. This also keeps consistency with the best-estimate.

Estimating the propensity function
source('___lgb_pdx_estimate.R')

## remove stale propensity predictions before re-training
unlink(file.path('preds', "*_pdx.json"))

## hyperparameter grid for optimization (a single configuration here)
hyperparameter_grid <- expand.grid(
  learning_rate = c(0.01),
  feature_fraction = c(0.75),
  bagging_fraction = c(0.75),
  max_depth = c(5),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## one propensity lightgbm per scenario, named after the scenario
pdx_lgb <- lapply(setNames(nm = names(sims)), function(name){
  scenario_data <- list('train' = sims[[name]],
                        'valid' = valid[[name]],
                        'test' = test[[name]])
  the_pdx_lightgbm_fun(list_data = scenario_data, name = name,
                       hyperparameter_grid = hyperparameter_grid)
})
Pdx for scenario:  Scenario1  
hyperparam  1 / 1 
Best valid binary logloss: 0.5789582  
optimal ntree: 392  
Training time: 30.01124  sec. 
Pdx for scenario:  Scenario2  
hyperparam  1 / 1 
Best valid binary logloss: 0.5846292  
optimal ntree: 414  
Training time: 30.69919  sec. 
Pdx for scenario:  Scenario3  
hyperparam  1 / 1 
Best valid binary logloss: 0.6313558  
optimal ntree: 519  
Training time: 33.90139  sec. 
Training the propensity function on experimental data (for later use in section 5)
## hyperparameter grid for the simulation-study propensity models (section 5)
hyperparameter_grid_sims <- expand.grid(
  learning_rate = c(0.01),
  bagging_fraction = c(0.75),
  max_depth = c(6),
  lambda_l1 = c(0.5),
  lambda_l2 = c(0.5)
)

## one propensity lightgbm per simulated sample
pdx_sims <- lapply(seq_along(sim_samples), function(idx){
  sample_data <- list('train' = sim_samples[[idx]]$train,
                      'valid' = sim_samples[[idx]]$valid,
                      'test' = sim_samples[[idx]]$test)
  the_pdx_lightgbm_fun(list_data = sample_data, name = paste0('sim', idx),
                       hyperparameter_grid = hyperparameter_grid_sims)
})

2.3 Aware premium

The aware premium, \(\widehat{\mu}^A(\mathbf{x})\), requires knowledge of the marginal distribution of \(D\). The aware price, a particular case of the ``discrimination-free’’ price of Lindholm et al. (2022), is computed as:
\[\begin{equation*} \widehat{\mu}^A(\mathbf{x}) = \sum_{d\in \mathcal{D}} \widehat{\mu}^B(\mathbf{x}, d) \widehat{\Pr}(D = d). \end{equation*}\] As discussed earlier, we assume \(D\) is discrete or discretized. If the training sample is representative of the target population (portfolio, market, or region), empirical proportions are consistent estimators of \(\Pr(D = d)\). This is also an estimator suggested in Section 5 of Lindholm et al. (2022).

Computing the empirical proportions for protected subpopulations
## empirical distribution of D in each training scenario
marg_dist <- lapply(sims, function(data){
  prop.table(table(data$D))
})

## persist the empirical proportions for later sections
saveRDS(marg_dist, 'preds/the_empirical_proportions.rds')

## empirical proportions of the protected subpopulations in the
## experimental data (for later use in section 5)
marg_dist_sims <- lapply(seq_along(sim_samples), function(idx){
  prop.table(table(sim_samples[[idx]]$train$D))
})
Note

This method generalizes the use of \(D\) as a control variable to prevent distortions in estimated effect of \(\mathbf{X}\) due to omitted variable bias. It applies to all model types and has a causal interpretation under the assumptions that: (i) possible values of \(\mathbf{X}\) are observed across all groups of \(D\) (no extrapolation), (ii) \(D\) causes both \(\mathbf{X}\) and \(Y\), and (iii) there are no relevant unobserved variables. Under the same assumptions, other methods are estimators of the aware premium. One example is inverse probability weighting, which re-weights observations to (artificially) remove the association between \(\mathbf{X}\) and \(D\), preventing proxy effects.

2.4 Corrective premium

We estimate the corrective premium, \(\widehat{\mu}^C(\mathbf{x}, d)\), designed to enforce strong demographic parity by aligning premium distributions across protected groups. Various methods can achieve this, but we recommend the optimal transport approach of Hu, Ratz, and Charpentier (2024). Its advantage lies in modifying \(\widehat{\mu}^B(\mathbf{x}, d)\) while keeping the overall spectrum estimation anchored to the initial best-estimate premium. As an incremental adjustment, it minimizes modeling complexity while enforcing demographic parity, yielding a simple estimator for the corrective premium.

The corrective premium is computed by training a transport function that shifts best-estimate premiums for each protected group toward a common barycenter, ensuring demographic parity through corrective direct discrimination. See Hu, Ratz, and Charpentier (2024) for details. This optimal transport method is implemented in Python via the Equipy package of Fernandes Machado et al. (2025). The following chunk illustrates how this method, despite its complexity, translates into concise Python code.

Listing 2.1: Illustrative Python code for Wasserstein transport toward corrective premiums.
import numpy as np
import pandas as pd
from equipy.fairness import FairWasserstein

# Load best-estimate premiums and associated sensitive attributes
best_est_train = pd.read_csv('best_est_prem_train.csv')  # training premiums
sens_train = pd.read_csv('sensitive_train.csv')  # discrete sensitive data

# Train Fair Wasserstein transport to adjust premiums
# (fix: the original called fit() on undefined names best_est / sens)
barycenter = FairWasserstein()
barycenter.fit(best_est_train.values, sens_train.values)

# Load the test-set premiums and associated sensitive attributes
best_est_test = pd.read_csv('best_est_prem_test.csv')  # test premiums
sens_test = pd.read_csv('sensitive_test.csv')  # discrete sensitive data

# Apply transformation to obtain fairness-adjusted premiums
corrective_premiums_test = barycenter.transform(best_est_test.values, sens_test.values)

# Save results (fix: the original referenced undefined corrective_premiums)
pd.DataFrame(corrective_premiums_test).to_csv('corr_prem_test.csv', index=False)
Optimal transport and Wasserstein distance: technical details

TODO (Arthur): add the technical details on optimal transport and the Wasserstein distance here.

Computing the optimal transport mapping for the scenarios
## Run the Python optimal transport script via reticulate; it writes its
## output to disk rather than returning an R object.
source_python("___opt_transp.py")

## now, the folder 'transported' exists, with one epsilon_y file per scenario

To predict corrective premium on other datasets, one can alternatively train a lightgbm model of mapping from \((X, D)\) to the resulting corrective premiums from Equipy on the training sample.

Training a lightgbm mapping to the corrective premiums for the scenarios
## Load the helper that fits a lightgbm mapping from (X, D) to the
## Equipy-transported corrective premiums.
source('___lgb_mapping_to_corr.R')

## One fitted mapping per scenario, stored in a named list.
corr_mapping_lgb <- lapply(setNames(nm = names(sims)),
                           the_mapping_to_corr_lightgbm_fun)
Mapping to corr. for scenario:  Scenario1  
Data import: 0.8731852  sec. 
Data conversion: 0.009803057  sec. 
Lgb training : 23.66387  sec. 
Mapping to corr. for scenario:  Scenario2  
Data import: 0.335748  sec. 
Data conversion: 0.016119  sec. 
Lgb training : 24.49672  sec. 
Mapping to corr. for scenario:  Scenario3  
Data import: 0.3372579  sec. 
Data conversion: 0.01687694  sec. 
Lgb training : 17.84801  sec. 

2.5 Hyperaware premium

The hyperaware premium, \(\widehat{\mu}^H(\mathbf{x})\), is the best non-directly discriminatory approximation of the corrective premium. We construct a lightgbm using only \(\mathbf{X}\) to predict the corrective premium, aiming at demographic parity without direct reliance on \(D\):
\[\begin{equation*} \widehat{\mu}^H(\mathbf{x}) = E\{\widehat{\mu}^C(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
Since the response variable is a premium, MSE is a correct choice of loss function in most cases. Just as for the unaware premium, one can explicitly aggregate corrective premiums across protected groups using estimated propensities as weights. This is what we do here.

2.6 Visualization of the estimated spectrum

By combining all the trained models as components into a trained spectrum of fairness, we can define the premium function that will serve to predict the premiums.

Defining the estimated premium and metrics functions
## One premium-function set per scenario: assemble the four trained
## ingredients (best-estimate model, propensity model, corrective mapping,
## marginal distribution of D) into the five premium estimators.
premiums <- lapply(
  setNames(nm = names(sims)),
  function(scenario) {
    premium_generator(
      best = best_lgb[[scenario]]$pred_fun,
      pdx = pdx_lgb[[scenario]]$pred_fun,
      maps_to_corr = corr_mapping_lgb[[scenario]],
      marg = marg_dist[[scenario]]
    )
  }
)

## Matching quantile functions, derived from the premium functions above.
quants <- lapply(
  setNames(nm = names(sims)),
  function(scenario) quant_generator(premiums = premiums[[scenario]])
)


## For experimental data (future use in section 5).
## Fix: spell out `object =` in setNames() — the original used the partial
## argument match `obj =`, an anti-pattern that breaks if setNames() ever
## gains an argument starting with "obj".
premiums_sims <- setNames(object = seq_along(sim_samples),
                          nm = names(sim_samples)) %>% lapply(function(idx){
  premium_generator(best = best_sims[[idx]]$pred_fun,
                  pdx = pdx_sims[[idx]]$pred_fun,
                  ## Placeholder mapping: never used downstream, since
                  ## section 5 focuses on proxy vulnerability only.
                  maps_to_corr = corr_mapping_lgb[['Scenario1']],
                  marg = marg_dist_sims[[idx]])
})

## Quantile functions matching the experimental premium functions.
quants_sims <- setNames(object = seq_along(sim_samples),
                        nm = names(sim_samples)) %>% lapply(function(idx){
  quant_generator(premiums = premiums_sims[[idx]])
})
Computation of estimated premiums and local metrics across the grid
df_to_g_file <- "preds/df_to_g.json"

# Reuse the cached grid predictions if the JSON file exists;
# otherwise compute them and write the cache.
if (file.exists(df_to_g_file)) {
  message(sprintf("[%s] File exists. Reading df_to_g from %s", Sys.time(), df_to_g_file))
  df_to_g <- fromJSON(df_to_g_file)
} else {

  ## Theoretical grid statistics from the first section of the online supplement
  preds_grid_stats_theo <- fromJSON('preds/preds_grid_stats_theo.json')

  df_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
    message(sprintf("[%s] Processing: %s", Sys.time(), name))

    local_scenario_df <- preds_grid_stats_theo[[name]]

    # Start time for this scenario
    start_time <- Sys.time()

    # Step 1: Compute the five estimated premiums across the grid
    message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
    premium_results <- setNames(nm = levels_for_premiums) %>%
      sapply(function(s) {
        message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
        premiums[[name]][[s]](
          x1 = local_scenario_df$x1,
          x2 = local_scenario_df$x2,
          d = local_scenario_df$d
        )
      })

    # Step 2: Compute the estimated propensity
    message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
    pdx_results <- pdx_lgb[[name]]$pred_fun(
      data.frame(
        X1 = local_scenario_df$x1,
        X2 = local_scenario_df$x2,
        D = local_scenario_df$d
      )
    )

    # Step 3: Combine results (fix: the log label previously said "Step 5")
    message(sprintf("[%s] Step 3: Combining results", Sys.time()))
    result <- data.frame(
      local_scenario_df,
      premium_results,
      pdx = pdx_results
    )

    # Log completion time
    end_time <- Sys.time()
    message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))

    return(result)
  })

  # Save the entire df_to_g object to JSON
  message(sprintf("[%s] Saving df_to_g to %s", Sys.time(), df_to_g_file))
  toJSON(df_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(df_to_g_file)

  ## Fix: drop the object actually created above. The original called
  ## rm(df_to_g_theo), which was never defined and only emitted a warning.
  rm(preds_grid_stats_theo)
}

grid_stats_path <- 'preds/preds_grid_stats.json'

## Load the cached local-metric grid if present; otherwise append the
## quantile-based local metrics to df_to_g and write the cache.
if (file.exists(grid_stats_path)) {
  preds_grid_stats <- fromJSON(grid_stats_path)
} else {
  preds_grid_stats <- lapply(setNames(nm = names(df_to_g)), function(name) {
    grid_df <- df_to_g[[name]]
    quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
      quants[[name]][[s]](x1 = grid_df$x1,
                          x2 = grid_df$x2,
                          d = grid_df$d)
    })
    data.frame(grid_df, quant_cols)
  })
  write(toJSON(preds_grid_stats, pretty = TRUE, auto_unbox = TRUE),
        grid_stats_path)
}
Computing estimated premiums and local metrics for the simulations
pop_to_g_file <- "preds/pop_to_g.json"

# Reuse the cached population predictions if the JSON file exists;
# otherwise compute them and write the cache.
if (file.exists(pop_to_g_file)) {
  message(sprintf("[%s] File exists. Reading pop_to_g from %s", Sys.time(), pop_to_g_file))
  pop_to_g <- fromJSON(pop_to_g_file)
} else {
  ## Theoretical population statistics from the first section of the online supplement
  preds_pop_stats_theo <- fromJSON('preds/preds_pop_stats_theo.json')

  pop_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
    message(sprintf("[%s] Processing: %s", Sys.time(), name))

    # Start time for this simulation
    start_time <- Sys.time()

    list_data <- list('train' = sims[[name]],
                      'valid' = valid[[name]],
                      'test' = test[[name]])

    ## One result per sample (train / valid / test)
    result <- setNames(nm = names(list_data)) %>% lapply(function(nm) {

      data <- list_data[[nm]]

      # Step 1: Compute the five estimated premiums on this sample
      message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
      premium_results <- setNames(nm = levels_for_premiums) %>%
        sapply(function(s) {
          message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
          premiums[[name]][[s]](
            x1 = data$X1,
            x2 = data$X2,
            d = data$D
          )
        })

      # Step 2: Compute the estimated propensity
      message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
      pdx_results <- pdx_lgb[[name]]$pred_fun(
        data.frame(
          X1 = data$X1,
          X2 = data$X2,
          D = data$D
        )
      )

      # Step 3: Combine results (fix: the log label previously said "Step 5")
      message(sprintf("[%s] Step 3: Combining results", Sys.time()))
      data.frame(
        preds_pop_stats_theo[[name]][[nm]],
        premium_results,
        pdx = pdx_results
      )
    })

    # Log completion time
    end_time <- Sys.time()
    message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))

    return(result)
  })

  # Save the entire pop_to_g object to JSON (fix: comment previously said "df_to_g")
  message(sprintf("[%s] Saving pop_to_g to %s", Sys.time(), pop_to_g_file))
  toJSON(pop_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(pop_to_g_file)

  ## Fix: drop the object actually created above. The original called
  ## rm(pop_to_g_theo), which was never defined and only emitted a warning.
  rm(preds_pop_stats_theo)
}


pop_stats_path <- 'preds/preds_pop_stats.json'

## Load the cached population metrics if present; otherwise append the
## quantile-based local metrics to each train/valid/test sample and cache.
if (file.exists(pop_stats_path)) {
  preds_pop_stats <- fromJSON(pop_stats_path)
} else {
  preds_pop_stats <- lapply(setNames(nm = names(sims)), function(name) {
    lapply(setNames(nm = names(pop_to_g[[name]])), function(set) {
      local_df <- pop_to_g[[name]][[set]]
      quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
        quants[[name]][[s]](x1 = local_df$X1,
                            x2 = local_df$X2,
                            d = local_df$D)
      })
      data.frame(local_df, quant_cols)
    })
  })
  write(toJSON(preds_pop_stats, pretty = TRUE, auto_unbox = TRUE),
        pop_stats_path)
}
Computing estimated premiums and local metrics for the experiment
sims_to_g_file <- "preds/sims_to_g.json"

# Reuse the cached experiment predictions if the JSON file exists
if (file.exists(sims_to_g_file)) {
  message(sprintf("[%s] File exists. Reading sims_to_g from %s", Sys.time(), sims_to_g_file))
  sims_to_g <- fromJSON(sims_to_g_file)
} else {
## Theoretical statistics from the first section of the online supplement
preds_sims_stats_theo <- fromJSON('preds/preds_sims_stats_theo.json')
sims_to_g <- setNames(object = seq_along(sim_samples), 
                      nm = names(sim_samples)) %>% lapply(function(idx) {
  message(sprintf("[%s] Processing: %s", Sys.time(), paste0(idx, '/', length(sim_samples))))
  
  ## One result per sample set within this simulated scenario
  samples_to_ret <- setNames(nm = names(sim_samples[[idx]])) %>% lapply(function(nm){
  data <- preds_sims_stats_theo[[idx]][[nm]]
    
    # Step 1: Compute the five estimated premiums for this sample
  message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
  premium_results <- setNames(nm = levels_for_premiums) %>%
    sapply(function(s) {
      message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
      premiums_sims[[idx]][[s]](
        x1 = data$X1,
        x2 = data$X2,
        d = data$D
      )
    })
  
  # Step 2: Compute the estimated propensity
  message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
  pdx_results <- pdx_sims[[idx]]$pred_fun(
    data.frame(
      X1 = data$X1,
      X2 = data$X2,
      D = data$D
    )
  )
  
  
  # Combine results; the assignment's value is what this lapply captures.
  # NOTE(review): the "Step 5" label below looks like a copy-paste leftover
  # (there are only three steps) — confirm and relabel as "Step 3".
  message(sprintf("[%s] Step 5: Combining results", Sys.time()))
  result <- data.frame(
    data,
    premium_results,
    pdx = pdx_results
  )
    
  })
  
  return(samples_to_ret)
})

# Save the entire sims_to_g object to JSON (comment fixed: previously said "df_to_g")
  message(sprintf("[%s] Saving sims_to_g to %s", Sys.time(), sims_to_g_file))
  toJSON(sims_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(sims_to_g_file)
}

sims_stats_path <- 'preds/preds_sims_stats.json'

## Load the cached experiment metrics if present; otherwise append the
## quantile-based local metrics to each simulated sample and cache.
if (file.exists(sims_stats_path)) {
  preds_sims_stats <- fromJSON(sims_stats_path)
} else {
  preds_sims_stats <- lapply(setNames(nm = names(sims_to_g)), function(name) {
    lapply(setNames(nm = names(sims_to_g[[name]])), function(set) {
      local_df <- sims_to_g[[name]][[set]]
      quant_cols <- sapply(setNames(nm = levels_for_quants), function(s) {
        quants_sims[[name]][[s]](x1 = local_df$X1,
                                 x2 = local_df$X2,
                                 d = local_df$D)
      })
      data.frame(local_df, quant_cols)
    })
  })
  write(toJSON(preds_sims_stats, pretty = TRUE, auto_unbox = TRUE),
        sims_stats_path)
}
R code producing the estimated propensity illustration across scenarios
## Propensity plot, one panel per scenario, arranged side by side.
to_save_pdx_perpop <- names(df_to_g) %>% 
  lapply(function(name){
    cols <- the_CAS_colors
    pop_id <- which(names(df_to_g) == name)
    
    ## keep the y axis only on the first panel
    if(pop_id == 1){ # first panel: full y axis with label
      # (fix: the original comment said "If it's the last" — the condition
      # selects the FIRST panel)
      the_y_scale <- ylim(0,1)
      the_y_label <- latex2exp::TeX("$\\widehat{P}(D = 1|x_1, x_2)$")
    } else { # other panels: same limits, no labels
      the_y_scale <- scale_y_continuous(labels = NULL, limits = c(0,1))
      the_y_label <- NULL
    }
    
    the_pops <- c('Scenario 1', 'Scenario 2', 'Scenario 3')
    
    ## estimated propensity against x1, one line per x2 level
    df_to_g[[name]] %>% 
      mutate(the_population = name) %>% 
  filter(x1 >= -9, x1 <= 11,
         d == 1) %>% 
  group_by(x1, x2) %>% summarise(pdx = mean(pdx)) %>%  ungroup() %>% 
  ggplot(aes(x = x1, y = pdx,
             lty = factor(x2),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(x2),
             color = factor(x2))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '31', '21', '11'), name = latex2exp::TeX('$x_2$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$x_2$')) +
  scale_linewidth_manual(values = c(2, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.65, 0.75, 0.85, 0.9), name = latex2exp::TeX('$x_2$')) + 
  labs(x = latex2exp::TeX("$x_1$"),
       y = the_y_label,
       title = latex2exp::TeX(the_pops[pop_id])) + 
  scale_x_continuous(breaks = c(-3:3)*3 + 1)  + # see above
  theme_minimal() + 
  the_y_scale +
  theme(plot.title = element_text(hjust=0.5))
  }) %>% ggpubr::ggarrange(plotlist = .,
                           nrow = 1,
                           widths = 15, heights = 1,
                           common.legend = TRUE, # fix: spell out TRUE, not T
                           legend = 'right')

if (!dir.exists('figs')) dir.create('figs')
ggsave(filename = "figs/graph_pdx_perpop.png",
       plot = to_save_pdx_perpop,
       height = 3.25,
       width = 7.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.1: Estimated propensity in terms of \(x_1\) and \(x_2\) for simulations
R code producing the estimated best-estimate, unaware, and aware illustration.
## Best-estimate / unaware / aware premiums, one panel per scenario.
to_save_premiumsdense_BUA_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## darkened Spectral colors for the three premium curves
  cols <- RColorBrewer::brewer.pal(5, 'Spectral')[1:3] %>%  colorspace::darken(0.25)
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep the y axis only on the first panel
  # (fix: the original comment said "If it's the last" — the condition
  # selects the FIRST panel)
  if(pop_name == head(names(df_to_g), 1)){ # first panel: dollar-formatted axis
    the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                   labels = scales::dollar,
                   limits = c(90, 140))
    the_y_label <- latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
  } else { # other panels: same breaks/limits, no labels
    the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                   labels = NULL,
                   limits = c(90, 140))
    the_y_label <- NULL
  }
  
  ## long format over the first three premiums (B, U, A)
  ## (fix: dropped the redundant paste0() no-op wrappers)
  to_illustrate <- df_to_g[[pop_name]] %>%
    reshape2::melt(measure.vars = levels_for_premiums[1:3]) %>% 
    mutate(variable = factor(variable,
                             levels = levels_for_premiums[1:3],
                             labels = labels_for_premiums[1:3])) %>% 
    filter(x1 <= 10, x1 >= -8) 
  
  ## mask the curves where the estimated propensity is too small
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15), heights = 1,
                           common.legend = TRUE, # fix: spell out TRUE, not T
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_BUA_perpop.png",
       plot = to_save_premiumsdense_BUA_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.2: Estimated best-estimate \(\widehat{\mu}^B\), unaware \(\widehat{\mu}^U\), and aware \(\widehat{\mu}^A\) premiums for scenarios 1, 2, and 3 in the Example as a function of \(x_1\), \(x_2\), and \(d\).
R code producing the estimated aware, hyperaware, corrective illustration.
## Aware / hyperaware / corrective premiums, one panel per scenario.
to_save_premiumsdense_AHC_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## darkened Spectral colors for the three premium curves
  cols <- RColorBrewer::brewer.pal(5, 'Spectral')[3:5] %>%  colorspace::darken(0.25)
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep the y axis only on the first panel
  # (fix: the original comment said "If it's the last" — the condition
  # selects the FIRST panel)
  if(pop_name == head(names(df_to_g), 1)){ # first panel: dollar-formatted axis
    the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                   labels = scales::dollar,
                   limits = c(90, 130))
    the_y_label <- latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
  } else { # other panels: same breaks/limits, no labels
    the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                   labels = NULL,
                   limits = c(90, 130))
    the_y_label <- NULL
  }
  
  ## long format over the last three premiums (A, H, C)
  ## (fix: dropped the redundant paste0() no-op wrappers)
  to_illustrate <- df_to_g[[pop_name]] %>%
    reshape2::melt(measure.vars = levels_for_premiums[3:5]) %>% 
    mutate(variable = factor(variable,
                             levels = levels_for_premiums[3:5],
                             labels = labels_for_premiums[3:5])) %>% 
    filter(x1 <= 10, x1 >= -8) 
  
  ## mask the curves where the estimated propensity is too small
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15),
                            heights = 1,
                           common.legend = TRUE, # fix: spell out TRUE, not T
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_AHC_perpop.png",
       plot = to_save_premiumsdense_AHC_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)